{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sagemaker\n",
    "from pathlib import Path\n",
    "from sagemaker.predictor import json_serializer\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "role = sagemaker.get_execution_role()\n",
    "sagemaker_session = sagemaker.Session()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup Path "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# location for train.csv, val.csv and labels.csv\n",
    "DATA_PATH = Path(\"../sample_data/multi_label_toxic_comments/\")   \n",
    "\n",
    "# Location for storing training_config.json\n",
    "CONFIG_PATH = DATA_PATH/'config'\n",
    "CONFIG_PATH.mkdir(exist_ok=True)\n",
    "\n",
    "# S3 bucket name\n",
    "bucket = sagemaker_session.default_bucket()\n",
    "\n",
    "# Prefix for S3 bucket for input and output\n",
    "prefix = 'toxic_comments/input'\n",
    "prefix_output = 'toxic_comments/output'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Upload Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'s3://sagemaker-us-east-1-835319576252/toxic_comments/input/labels.csv'"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# This is a helper feature to upload data\n",
    "# from your local machine to S3 bucket.\n",
    "\n",
    "s3_input = sagemaker_session.upload_data(DATA_PATH, bucket=bucket , key_prefix=prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'s3://sagemaker-us-east-1-835319576252/toxic_comments/input/labels.csv'"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sagemaker_session.upload_data(str(DATA_PATH/'label/labels.csv'), bucket=bucket , key_prefix=prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'s3://sagemaker-us-east-1-835319576252/toxic_comments/input/train_sample.csv'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sagemaker_session.upload_data(str(DATA_PATH/'data/train_sample.csv'), bucket=bucket , key_prefix=prefix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'s3://sagemaker-us-east-1-835319576252/toxic_comments/input/val_sample.csv'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sagemaker_session.upload_data(str(DATA_PATH/'data/val_sample.csv'), bucket=bucket , key_prefix=prefix)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameters & Training Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "hyperparameters = {\n",
    "    \"epochs\": 10,\n",
    "    \"lr\": 8e-5,\n",
    "    \"max_seq_length\": 512,\n",
    "    \"train_batch_size\": 16,\n",
    "    \"lr_schedule\": \"warmup_cosine\",\n",
    "    \"warmup_steps\": 1000,\n",
    "    \"optimizer_type\": \"adamw\"\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_config = {\n",
    "    \"run_text\": \"toxic comments\",\n",
    "    \"finetuned_model\": None,\n",
    "    \"do_lower_case\": \"True\",\n",
    "    \"train_file\": \"train_sample.csv\",\n",
    "    \"val_file\": \"val_sample.csv\",\n",
    "    \"label_file\": \"labels.csv\",\n",
    "    \"text_col\": \"comment_text\",\n",
    "    \"label_col\": '[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]',\n",
    "    \"multi_label\": \"True\",\n",
    "    \"grad_accumulation_steps\": \"1\",\n",
    "    \"fp16_opt_level\": \"O1\",\n",
    "    \"fp16\": \"True\",\n",
    "    \"model_type\": \"roberta\",\n",
    "    \"model_name\": \"roberta-base\",\n",
    "    \"logging_steps\": \"300\"\n",
    "}\n",
    "\n",
    "with open(CONFIG_PATH/'training_config.json', 'w') as f:\n",
    "    json.dump(training_config, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create an Estimator and start training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "835319576252.dkr.ecr.us-east-1.amazonaws.com/fluent-sagemaker-fast-bert:1.0-gpu-py36\n"
     ]
    }
   ],
   "source": [
    "account = sagemaker_session.boto_session.client('sts').get_caller_identity()['Account']\n",
    "region = sagemaker_session.boto_session.region_name\n",
    "\n",
    "image = \"{}.dkr.ecr.{}.amazonaws.com/fluent-sagemaker-fast-bert:1.0-gpu-py36\".format(account, region)\n",
    "print(image)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "output_path = \"s3://{}/{}\".format(bucket, prefix_output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = sagemaker.estimator.Estimator(image, \n",
    "                                          role,\n",
    "                                          train_instance_count=1, \n",
    "                                          train_instance_type='ml.p3.8xlarge', \n",
    "                                          output_path=output_path,\n",
    "                                          base_job_name='toxic-comments',\n",
    "                                          enable_sagemaker_metrics=True,\n",
    "                                          hyperparameters=hyperparameters,\n",
    "                                          sagemaker_session=sagemaker_session\n",
    "                                         )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2020-01-31 18:49:05 Starting - Starting the training job...\n",
      "2020-01-31 18:49:07 Starting - Launching requested ML instances......\n",
      "2020-01-31 18:50:13 Starting - Preparing the instances for training......\n",
      "2020-01-31 18:51:27 Downloading - Downloading input data\n",
      "2020-01-31 18:51:27 Training - Downloading the training image...............\n",
      "2020-01-31 18:54:03 Uploading - Uploading generated training model\n",
      "2020-01-31 18:54:03 Failed - Training job failed\n",
      "\u001b[34mStarting the training.\u001b[0m\n",
      "\u001b[34m/opt/ml/input/data/training/config/training_config.json\u001b[0m\n",
      "\u001b[34m{'run_text': 'toxic comments', 'finetuned_model': None, 'do_lower_case': 'True', 'train_file': 'train_sample.csv', 'val_file': 'val_sample.csv', 'label_file': 'labels.csv', 'text_col': 'comment_text', 'label_col': '[\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]', 'multi_label': 'True', 'grad_accumulation_steps': '1', 'fp16_opt_level': 'O1', 'fp16': 'True', 'model_type': 'roberta', 'model_name': 'roberta-base', 'logging_steps': '300'}\u001b[0m\n",
      "\u001b[34m{'train_batch_size': '16', 'warmup_steps': '1000', 'lr': '8e-05', 'max_seq_length': '512', 'optimizer_type': 'adamw', 'lr_schedule': 'warmup_cosine', 'epochs': '10'}\u001b[0m\n",
      "\u001b[34m01/31/2020 18:53:58 - INFO - root -   model path used /opt/ml/code/pretrained_models/roberta-base\u001b[0m\n",
      "\u001b[34m01/31/2020 18:53:58 - INFO - root -   finetuned model not available - loading standard pretrained model\u001b[0m\n",
      "\u001b[34m01/31/2020 18:53:58 - INFO - transformers.tokenization_utils -   Model name '/opt/ml/code/pretrained_models/roberta-base' not found in model shortcut name list (roberta-base, roberta-large, roberta-large-mnli, distilroberta-base, roberta-base-openai-detector, roberta-large-openai-detector). Assuming '/opt/ml/code/pretrained_models/roberta-base' is a path, a model identifier, or url to a directory containing tokenizer files.\u001b[0m\n",
      "\u001b[34mException during training: 'PosixPath' object has no attribute 'decode'\u001b[0m\n",
      "\u001b[34mTraceback (most recent call last):\n",
      "  File \"/opt/ml/code/train\", line 138, in train\n",
      "    PRETRAINED_PATH, do_lower_case=bool(training_config[\"do_lower_case\"])\n",
      "  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 309, in from_pretrained\n",
      "    return cls._from_pretrained(*inputs, **kwargs)\n",
      "  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 339, in _from_pretrained\n",
      "    if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):\n",
      "  File \"/opt/conda/lib/python3.6/site-packages/transformers/file_utils.py\", line 143, in is_remote_url\n",
      "    parsed = urlparse(url_or_filename)\n",
      "  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 367, in urlparse\n",
      "    url, scheme, _coerce_result = _coerce_args(url, scheme)\n",
      "  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 123, in _coerce_args\n",
      "    return _decode_args(args) + (_encode_result,)\n",
      "  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 107, in _decode_args\n",
      "    return tuple(x.decode(encoding, errors) if x else '' for x in args)\n",
      "  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 107, in <genexpr>\n",
      "    return tuple(x.decode(encoding, errors) if x else '' for x in args)\u001b[0m\n",
      "\u001b[34mAttributeError: 'PosixPath' object has no attribute 'decode'\n",
      "\u001b[0m\n"
     ]
    },
    {
     "ename": "UnexpectedStatusException",
     "evalue": "Error for Training job toxic-comments-2020-01-31-18-49-05-491: Failed. Reason: AlgorithmError: Exception during training: 'PosixPath' object has no attribute 'decode'\nTraceback (most recent call last):\n  File \"/opt/ml/code/train\", line 138, in train\n    PRETRAINED_PATH, do_lower_case=bool(training_config[\"do_lower_case\"])\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 309, in from_pretrained\n    return cls._from_pretrained(*inputs, **kwargs)\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 339, in _from_pretrained\n    if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/file_utils.py\", line 143, in is_remote_url\n    parsed = urlparse(url_or_filename)\n  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 367, in urlparse\n    url, scheme, _coerce_result = _coerce_args(url, scheme)\n  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 123, in _coerce_args\n    return _decode_args(args) + (_encode_result",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-20-41efce4517f5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms3_input\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, inputs, wait, logs, job_name, experiment_config)\u001b[0m\n\u001b[1;32m    463\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    464\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 465\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_training_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    466\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    467\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_compilation_job_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sagemaker/estimator.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m   1061\u001b[0m         \u001b[0;31m# If logs are requested, call logs_for_jobs.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1062\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;34m\"None\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1063\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlog_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1064\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1065\u001b[0m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_job\u001b[0;34m(self, job_name, wait, poll, log_type)\u001b[0m\n\u001b[1;32m   3019\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3020\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3021\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"TrainingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3022\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3023\u001b[0m                 \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.local/lib/python3.6/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m   2613\u001b[0m                 ),\n\u001b[1;32m   2614\u001b[0m                 \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2615\u001b[0;31m                 \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2616\u001b[0m             )\n\u001b[1;32m   2617\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Training job toxic-comments-2020-01-31-18-49-05-491: Failed. Reason: AlgorithmError: Exception during training: 'PosixPath' object has no attribute 'decode'\nTraceback (most recent call last):\n  File \"/opt/ml/code/train\", line 138, in train\n    PRETRAINED_PATH, do_lower_case=bool(training_config[\"do_lower_case\"])\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 309, in from_pretrained\n    return cls._from_pretrained(*inputs, **kwargs)\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py\", line 339, in _from_pretrained\n    if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):\n  File \"/opt/conda/lib/python3.6/site-packages/transformers/file_utils.py\", line 143, in is_remote_url\n    parsed = urlparse(url_or_filename)\n  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 367, in urlparse\n    url, scheme, _coerce_result = _coerce_args(url, scheme)\n  File \"/opt/conda/lib/python3.6/urllib/parse.py\", line 123, in _coerce_args\n    return _decode_args(args) + (_encode_result"
     ]
    }
   ],
   "source": [
    "# This is throwing the following error with both py3.7 (original) and py3.6 (i changed the docker image to use this)\n",
    "estimator.fit(s3_input)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Deploy the model to hosting service"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictor = estimator.deploy(1, \n",
    "                             'ml.m5.large', \n",
    "                             endpoint_name='bert-toxic-comments', \n",
    "                             update_endpoint=True, \n",
    "                             serializer=json_serializer)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
